source('../settings/settings.R')
source('commonFunctions.R')
library(nlme)
library(lme4)
set.seed(43)
inputFileDrive1 <- str_interp("../data/processed/analysis/TT1_Drive_${drive}_PP_${distPrev}m_${distNext}m.csv", list(drive=1, distPrev=DISTANCE_PREV, distNext=DISTANCE_NEXT))
inputFileDrive2 <- str_interp("../data/processed/analysis/TT1_Drive_${drive}_PP_${distPrev}m_${distNext}m.csv", list(drive=2, distPrev=DISTANCE_PREV, distNext=DISTANCE_NEXT))
inputFileDrive3 <- str_interp("../data/processed/analysis/TT1_Drive_${drive}_PP_${distPrev}m_${distNext}m.csv", list(drive=3, distPrev=DISTANCE_PREV, distNext=DISTANCE_NEXT))
inputFileDrive4 <- str_interp("../data/processed/analysis/TT1_Drive_${drive}_PP_${distPrev}m_${distNext}m.csv", list(drive=4, distPrev=30, distNext=30))
drive1 <- read.csv(inputFileDrive1)
drive2 <- read.csv(inputFileDrive2)
drive3 <- read.csv(inputFileDrive3)
drive4 <- read.csv(inputFileDrive4, stringsAsFactors = T)
dfSeg <- data.frame(rep(1, nrow(drive4)), rep(2, nrow(drive4)), rep(3, nrow(drive4)), rep(4, nrow(drive4)))
names(dfSeg) <- c("Seg1", "Seg2", "Seg3", "Seg4")
combinedDf_Seg1 <- cbind(drive4,
drive1$MeanPP_Seg0,
drive2$MeanPP_Seg1, drive3$MeanPP_Seg1,
drive2$MeanPP_Seg0_1, drive3$MeanPP_Seg0_1,
drive2$StdPP_Seg1, drive3$StdPP_Seg1,
drive2$StdPP_Seg0_1, drive3$StdPP_Seg0_1,
drive2$MeanPP_AccHigh1, drive3$MeanPP_AccHigh1,
drive2$X.MeanPP_AccLow1, drive3$X.MeanPP_AccLow1,
drive2$StdPP_AccHigh1, drive3$StdPP_AccHigh1,
drive2$StdPP_AccLow1, drive3$StdPP_AccLow1,
dfSeg$Seg1
)
combinedDf_Seg2 <- cbind(drive4,
drive1$MeanPP_Seg0,
drive2$MeanPP_Seg2, drive3$MeanPP_Seg2,
drive2$MeanPP_Seg0_2, drive3$MeanPP_Seg0_2,
drive2$StdPP_Seg2, drive3$StdPP_Seg2,
drive2$StdPP_Seg0_2, drive3$StdPP_Seg0_2,
drive2$MeanPP_AccHigh2, drive3$MeanPP_AccHigh2,
drive2$X.MeanPP_AccLow2, drive3$X.MeanPP_AccLow2,
drive2$StdPP_AccHigh2, drive3$StdPP_AccHigh2,
drive2$StdPP_AccLow2, drive3$StdPP_AccLow2,
dfSeg$Seg2
)
combinedDf_Seg3 <- cbind(drive4,
drive1$MeanPP_Seg0,
drive2$MeanPP_Seg3, drive3$MeanPP_Seg3,
drive2$MeanPP_Seg0_3, drive3$MeanPP_Seg0_3,
drive2$StdPP_Seg3, drive3$StdPP_Seg3,
drive2$StdPP_Seg0_3, drive3$StdPP_Seg0_3,
drive2$MeanPP_AccHigh3, drive3$MeanPP_AccHigh3,
drive2$X.MeanPP_AccLow3, drive3$X.MeanPP_AccLow3,
drive2$StdPP_AccHigh3, drive3$StdPP_AccHigh3,
drive2$StdPP_AccLow3, drive3$StdPP_AccLow3,
dfSeg$Seg3
)
combinedDf_Seg4 <- cbind(drive4,
drive1$MeanPP_Seg0,
drive2$MeanPP_Seg4, drive3$MeanPP_Seg4,
drive2$MeanPP_Seg0_4, drive3$MeanPP_Seg0_4,
drive2$StdPP_Seg4, drive3$StdPP_Seg4,
drive2$StdPP_Seg0_4, drive3$StdPP_Seg0_4,
drive2$MeanPP_AccHigh4, drive3$MeanPP_AccHigh4,
drive2$X.MeanPP_AccLow4, drive3$X.MeanPP_AccLow4,
drive2$StdPP_AccHigh4, drive3$StdPP_AccHigh4,
drive2$StdPP_AccLow4, drive3$StdPP_AccLow4,
dfSeg$Seg4
)
common_names <- c("PP_Dev_1_Turning",
"PP_Dev_2_Straight", "PP_Dev_3_Straight",
"PP_Dev_2_Turning", "PP_Dev_3_Turning",
"Std_PP_2_Straight", "Std_PP_3_Straight",
"Std_PP_2_Turning", "Std_PP_3_Turning",
"Mean_PP_2_AccHigh", "Mean_PP_3_AccHigh",
"Mean_PP_2_AccLow", "Mean_PP_3_AccLow",
"Std_PP_2_AccHigh", "Std_PP_3_AccHigh",
"Std_PP_2_AccLow", "Std_PP_3_AccLow",
"Segment")
names(combinedDf_Seg1) <- c(names(drive4), common_names)
names(combinedDf_Seg2) <- c(names(drive4), common_names)
names(combinedDf_Seg3) <- c(names(drive4), common_names)
names(combinedDf_Seg4) <- c(names(drive4), common_names)
# combinedDf_Seg1$Subject <- paste0(as.factor(combinedDf_Seg1$Subject), ".S1")
# combinedDf_Seg2$Subject <- paste0(as.factor(combinedDf_Seg2$Subject), ".S2")
# combinedDf_Seg3$Subject <- paste0(as.factor(combinedDf_Seg3$Subject), ".S3")
# combinedDf_Seg4$Subject <- paste0(as.factor(combinedDf_Seg4$Subject), ".S4")
combinedDf <- rbind(combinedDf_Seg1, combinedDf_Seg2, combinedDf_Seg3, combinedDf_Seg4)
# combinedDf$Subject <- paste0("#", str_pad(combinedDf$Subject, 2, pad="0"))
combinedDf$Segment <- as.factor(combinedDf$Segment)
combinedDf$ActivityEncoded <- factor(ifelse(combinedDf$Activity == "NO", "1", ifelse(combinedDf$Activity == "C", "2", "3")))
combinedDf <- combinedDf[complete.cases(combinedDf),]
combinedDf$Subject = as.factor(combinedDf$Subject)
model = lm(PP_After ~
PP_Dev_2_Straight +
PP_Dev_3_Straight +
PP_Dev_2_Turning +
PP_Dev_3_Turning +
Std_PP_2_Straight +
Std_PP_3_Straight +
Std_PP_2_Turning +
Std_PP_3_Turning +
# PP_Prior +
factor(ActivityEncoded),
data=combinedDf, random = ~1|factor(Subject), method = "REML")
method = 'REML' is not supported. Using 'qr'In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument ‘random’ will be disregarded
# anova(model)
summary(model)
Call:
lm(formula = PP_After ~ PP_Dev_2_Straight + PP_Dev_3_Straight +
PP_Dev_2_Turning + PP_Dev_3_Turning + Std_PP_2_Straight +
Std_PP_3_Straight + Std_PP_2_Turning + Std_PP_3_Turning +
factor(ActivityEncoded), data = combinedDf, method = "REML",
random = ~1 | factor(Subject))
Residuals:
Min 1Q Median 3Q Max
-0.17764 -0.07137 0.00092 0.05144 0.34502
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.02294 0.04870 -0.471 0.6393
PP_Dev_2_Straight 0.68712 0.15583 4.410 4.45e-05 ***
PP_Dev_3_Straight -0.37734 0.23618 -1.598 0.1155
PP_Dev_2_Turning -0.22015 0.14683 -1.499 0.1391
PP_Dev_3_Turning 0.38632 0.22101 1.748 0.0857 .
Std_PP_2_Straight 0.02144 0.38757 0.055 0.9561
Std_PP_3_Straight 0.50369 0.36818 1.368 0.1765
Std_PP_2_Turning -0.08343 0.50205 -0.166 0.8686
Std_PP_3_Turning -0.95800 0.60836 -1.575 0.1207
factor(ActivityEncoded)2 0.08647 0.03285 2.633 0.0108 *
factor(ActivityEncoded)3 0.17032 0.03184 5.349 1.51e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0993 on 59 degrees of freedom
Multiple R-squared: 0.5613, Adjusted R-squared: 0.4869
F-statistic: 7.548 on 10 and 59 DF, p-value: 1.338e-07
plot(model)




No Random Effects
linearModel1 <- lm(PP_After ~
Mean_PP_2_AccHigh
+ Mean_PP_2_AccLow
+ Mean_PP_3_AccHigh
+ Mean_PP_3_AccLow
+ Std_PP_2_AccHigh
+ Std_PP_2_AccLow
+ Std_PP_3_AccHigh
+ Std_PP_3_AccLow
# + PP_Prior
+ factor(ActivityEncoded),
data=combinedDf)
# anova(model)
summary(linearModel1)
Call:
lm(formula = PP_After ~ Mean_PP_2_AccHigh + Mean_PP_2_AccLow +
Mean_PP_3_AccHigh + Mean_PP_3_AccLow + Std_PP_2_AccHigh +
Std_PP_2_AccLow + Std_PP_3_AccHigh + Std_PP_3_AccLow + factor(ActivityEncoded),
data = combinedDf)
Residuals:
Min 1Q Median 3Q Max
-0.143483 -0.066038 -0.007552 0.051595 0.300313
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.06552 0.03640 -1.800 0.07695 .
Mean_PP_2_AccHigh 1.63376 0.50590 3.229 0.00203 **
Mean_PP_2_AccLow -1.15253 0.49763 -2.316 0.02405 *
Mean_PP_3_AccHigh 0.74436 0.31520 2.362 0.02152 *
Mean_PP_3_AccLow -0.66142 0.33913 -1.950 0.05589 .
Std_PP_2_AccHigh -1.28974 1.45755 -0.885 0.37982
Std_PP_2_AccLow 0.79583 1.15728 0.688 0.49435
Std_PP_3_AccHigh 0.15278 0.94374 0.162 0.87195
Std_PP_3_AccLow 0.91693 0.83810 1.094 0.27838
factor(ActivityEncoded)2 0.08724 0.03041 2.869 0.00571 **
factor(ActivityEncoded)3 0.14810 0.03043 4.867 8.83e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09353 on 59 degrees of freedom
Multiple R-squared: 0.6108, Adjusted R-squared: 0.5448
F-statistic: 9.259 on 10 and 59 DF, p-value: 5.37e-09
plot(linearModel1)




With Random Effects
linearModel1 <- lmer(PP_After ~
(1 | Subject)
+ Mean_PP_2_AccHigh
+ Mean_PP_2_AccLow
+ Mean_PP_3_AccHigh
+ Mean_PP_3_AccLow
+ Std_PP_2_AccHigh
+ Std_PP_2_AccLow
+ Std_PP_3_AccHigh
+ Std_PP_3_AccLow,
# + factor(ActivityEncoded),
data=combinedDf, REML = T)
Model failed to converge with max|grad| = 0.0670118 (tol = 0.002, component 1)Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?Model may not have converged with 1 eigenvalue close to zero: 1.1e-10
# anova(model)
summary(linearModel1)
Linear mixed model fit by REML. t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: PP_After ~ (1 | Subject) + Mean_PP_2_AccHigh + Mean_PP_2_AccLow +
Mean_PP_3_AccHigh + Mean_PP_3_AccLow + Std_PP_2_AccHigh + Std_PP_2_AccLow + Std_PP_3_AccHigh + Std_PP_3_AccLow
Data: combinedDf
REML criterion at convergence: -1503.4
Scaled residuals:
Min 1Q Median 3Q Max
-4.063e-07 -1.190e-07 -6.100e-09 1.335e-07 6.294e-07
Random effects:
Groups Name Variance Std.Dev.
Subject (Intercept) 7.124e-03 8.440e-02
Residual 2.068e-17 4.548e-09
Number of obs: 70, groups: Subject, 21
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 8.333e-02 2.203e-02 7.891e-05 3.783 0.999
Mean_PP_2_AccHigh 6.340e-14 5.153e-08 7.891e-05 0.000 1.000
Mean_PP_2_AccLow -3.925e-14 3.923e-08 7.891e-05 0.000 1.000
Mean_PP_3_AccHigh -1.859e-15 1.820e-08 7.891e-05 0.000 1.000
Mean_PP_3_AccLow 6.079e-15 2.337e-08 7.891e-05 0.000 1.000
Std_PP_2_AccHigh 1.459e-14 8.442e-08 7.891e-05 0.000 1.000
Std_PP_2_AccLow 1.525e-16 6.425e-08 7.891e-05 0.000 1.000
Std_PP_3_AccHigh -2.153e-14 5.689e-08 7.891e-05 0.000 1.000
Std_PP_3_AccLow 2.408e-14 4.744e-08 7.891e-05 0.000 1.000
Correlation of Fixed Effects:
(Intr) M_PP_2_AH M_PP_2_AL M_PP_3_AH M_PP_3_AL S_PP_2_AH S_PP_2_AL S_PP_3_AH
Mn_PP_2_AcH 0.000
Mn_PP_2_AcL 0.000 -0.960
Mn_PP_3_AcH 0.000 0.005 -0.029
Mn_PP_3_AcL 0.000 -0.055 0.015 -0.697
Std_PP_2_AH 0.000 0.147 -0.186 0.088 -0.073
Std_PP_2_AL 0.000 0.030 -0.013 -0.131 0.089 -0.877
Std_PP_3_AH 0.000 0.063 -0.115 -0.453 0.312 -0.339 0.190
Std_PP_3_AL 0.000 0.205 -0.222 0.376 -0.327 0.307 -0.236 -0.528
convergence code: 0
Model failed to converge with max|grad| = 0.0670118 (tol = 0.002, component 1)
Model is nearly unidentifiable: very large eigenvalue
- Rescale variables?
plot(linearModel1)

linearModel1 <- lm(PP_After ~
Mean_PP_2_AccHigh
+ Mean_PP_2_AccLow
+ Mean_PP_3_AccHigh
+ Mean_PP_3_AccLow
# + PP_Prior
+ factor(ActivityEncoded),
data=combinedDf)
# anova(model)
summary(linearModel1)
Call:
lm(formula = PP_After ~ Mean_PP_2_AccHigh + Mean_PP_2_AccLow +
Mean_PP_3_AccHigh + Mean_PP_3_AccLow + factor(ActivityEncoded),
data = combinedDf)
Residuals:
Min 1Q Median 3Q Max
-0.14161 -0.06916 -0.00742 0.04809 0.30760
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.05248 0.02751 -1.907 0.06103 .
Mean_PP_2_AccHigh 1.64939 0.38841 4.246 7.28e-05 ***
Mean_PP_2_AccLow -1.14578 0.37518 -3.054 0.00331 **
Mean_PP_3_AccHigh 0.65161 0.26162 2.491 0.01540 *
Mean_PP_3_AccLow -0.60534 0.28421 -2.130 0.03709 *
factor(ActivityEncoded)2 0.08906 0.02976 2.993 0.00394 **
factor(ActivityEncoded)3 0.15955 0.02892 5.516 6.92e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.09303 on 63 degrees of freedom
Multiple R-squared: 0.5888, Adjusted R-squared: 0.5496
F-statistic: 15.03 on 6 and 63 DF, p-value: 1.372e-10
plot(linearModel1)




Machine Learning
ppAfter <- combinedDf$PP_After
ppAfterArray <- matrix(ppAfter, nrow = 1,ncol = length(ppAfter))
thresholdPPAfter <- otsu(ppAfterArray, range=c(min(ppAfter), max(ppAfter))) # Expected Threshold > 0.10123
print(paste0('Threshold: ', thresholdPPAfter))
[1] "Threshold: 0.101235546875"
selectedDf <- combinedDf %>% select(
"Subject", "Activity", "PP_After", # "PP_Prior",
"Mean_PP_2_AccHigh", "Mean_PP_3_AccHigh",
"Mean_PP_2_AccLow", "Mean_PP_3_AccLow",
"Std_PP_2_AccHigh", "Std_PP_3_AccHigh",
"Std_PP_2_AccLow", "Std_PP_3_AccLow")
selectedDf$Subject <- NULL
selectedDf$Activity_NO <- ifelse(selectedDf$Activity == "NO", 1, 0)
selectedDf$Activity_C <- ifelse(selectedDf$Activity == "C", 1, 0)
selectedDf$Activity_M <- ifelse(selectedDf$Activity == "M", 1, 0)
selectedDf$Activity <- NULL
# selectedDf$PP_Dev_1_Turning <- NULL
# selectedDf$Std_PP_2_Straight <- NULL
# selectedDf$Std_PP_2_Turning <- NULL
# selectedDf$Std_PP_3_Straight <- NULL
# selectedDf$Std_PP_3_Turning <- NULL
#
# # According to Linear model
# selectedDf$PP_Dev_2_Straight <- abs(selectedDf$PP_Dev_2_Straight)
# selectedDf$PP_Dev_3_Straight <- abs(selectedDf$PP_Dev_3_Straight)
# selectedDf$PP_Dev_2_Turning <- abs(selectedDf$PP_Dev_2_Turning)
# selectedDf$PP_Dev_3_Turning <- abs(selectedDf$PP_Dev_3_Turning)
# selectedDf$PP_Prior <- abs(selectedDf$PP_Prior) # NULL
selectedDf$Class <- ifelse(selectedDf$PP_After >= thresholdPPAfter, T, F)
selectedDf$PP_After <- NULL
print(names(selectedDf))
[1] "Mean_PP_2_AccHigh" "Mean_PP_3_AccHigh" "Mean_PP_2_AccLow" "Mean_PP_3_AccLow" "Std_PP_2_AccHigh" "Std_PP_3_AccHigh"
[7] "Std_PP_2_AccLow" "Std_PP_3_AccLow" "Activity_NO" "Activity_C" "Activity_M" "Class"
# library(mefa)
# combinedDf <- rep(combinedDf, 10)
set.seed(39)
n_folds <- 10
params <- param <- list(objective = "binary:logistic",
booster = "gbtree",
eval_metric = "auc",
eta = 0.1,
max_depth = 10,
alpha = 1,
lambda = 0,
gamma = 0.45,
min_child_weight = 0.3,
subsample = 1,
colsample_bytree = 1)
# XGBoost Model
xgb_m <- xgb.cv( params = param,
data = as.matrix(selectedDf %>% select(-Class)) ,
label = selectedDf$Class,
nrounds = 100,
verbose = F,
prediction = T,
maximize = F, # Change this value to F will help to run with more itineration
nfold = n_folds,
metrics = c("auc", "error"),
early_stopping_rounds = 50,
stratified = T,
scale_pos_weight = 1)
# xgb_m$evaluation_log[xgb_m$best_iteration,"test_auc_mean"]
xgb_m$evaluation_log[xgb_m$best_iteration,]
NA
Performance Metrics
# Prediction
selectedDf$clsPred <- round(xgb_m$pred)
computePerformanceResults <- function(sdat){
sdat = sdat[complete.cases(sdat),]
acc = sum(sdat[,1] == sdat[,2])/nrow(sdat)
conf_mat = table(sdat)
specif = conf_mat[1,1]/sum(conf_mat[,1])
sensiv = conf_mat[2,2]/sum(conf_mat[,2])
preci = conf_mat[2,2]/sum(conf_mat[2,])
npv = conf_mat[1,1]/sum(conf_mat[1,])
return(c(acc,specif,sensiv,preci,npv))
}
# Get average performance
performance <- computePerformanceResults(selectedDf %>% select(Class, clsPred))
acc <- performance[1]
prec <- performance[4]
recall <- performance[3]
spec <- performance[2]
npv <- performance[5]
f1 <- (2 * recall * prec) / (recall + prec)
auc <- as.numeric(xgb_m$evaluation_log[xgb_m$best_iteration, "test_auc_mean"])
print(paste("Accuracy=", round(acc, 2)))
[1] "Accuracy= 0.81"
print(paste("Precision=", round(prec, 2)))
[1] "Precision= 0.7"
print(paste("Recall=", round(recall, 2)))
[1] "Recall= 0.84"
print(paste("Specificity=", round(spec, 2)))
[1] "Specificity= 0.8"
print(paste("NPV=", round(npv, 2)))
[1] "NPV= 0.9"
print(paste("F1=", round(f1, 2)))
[1] "F1= 0.76"
print(paste("AUC=", round(auc, 2)))
[1] "AUC= 0.9"
# Importance
bst <- xgboost( params = param,
data = as.matrix(selectedDf %>% select(-c(Class, clsPred))) ,
label = selectedDf$Class,
nrounds = 100,
verbose = F,
prediction = T,
maximize = F, # Change this value to F will help to run with more itineration
nfold = n_folds,
metrics = c("auc", "error"),
early_stopping_rounds = 50,
stratified = T,
scale_pos_weight = 1)
importanceDf <- xgb.importance(colnames(selectedDf %>% select(-c(Class, clsPred))), model = bst)
print(importanceDf)
library(pROC)
dfROC <- pROC::roc(response = ifelse(selectedDf$Class==T, 1, 0),
predictor = round(xgb_m$pred),
levels=c(0, 1), direction = "<")
# it = which.max(xgb_m$evaluation_log$test_auc_mean)
# best.iter = xgb_m$evaluation_log$iter[it]
# best.iter
plot(pROC::roc(response = ifelse(selectedDf$Class==T, 1, 0),
predictor = round(xgb_m$pred),
levels=c(0, 1), direction = "<"),
legacy.axes = TRUE,
main="ROC Curve",
lwd=1.5)

Plot feature importance
yAxis <- list(
title = 'Importance',
range=c(0.0, 1.0)
)
xAxis <- list(
title = ''
)
importanceDf$Feature <- factor(importanceDf$Feature, levels = importanceDf[order(-Gain),]$Feature)
fig_Importance <- plot_ly(importanceDf, x = ~Feature, y = ~Gain, type = 'bar', name = 'Gain', width=600) %>%
add_trace(y = ~Cover, name = 'Cover') %>%
add_trace(y = ~Frequency, name = 'Frequency') %>%
layout(yaxis = yAxis, xaxis=xAxis, barmode = 'group', title="Feature Importance") %>%
config(.Last.value, mathjax = 'cdn')
htmltools::tagList(fig_Importance)
---
title: "R Notebook"
output: html_notebook
---

```{r}
source('../settings/settings.R')
source('commonFunctions.R')
library(nlme)
library(lme4)
```

```{r}
# Seed the random number generator for this notebook run.
set.seed(43)

# Build the path of one per-drive analysis CSV.
#   drive    - drive number inserted into the file name.
#   distPrev - "previous" distance (metres) inserted into the file name.
#   distNext - "next" distance (metres) inserted into the file name.
# The defaults read DISTANCE_PREV / DISTANCE_NEXT, which are presumably defined
# by the sourced settings file -- they are not defined in this notebook.
driveFilePath <- function(drive,
                          distPrev = DISTANCE_PREV,
                          distNext = DISTANCE_NEXT) {
  str_interp(
    "../data/processed/analysis/TT1_Drive_${drive}_PP_${distPrev}m_${distNext}m.csv",
    list(drive = drive, distPrev = distPrev, distNext = distNext)
  )
}

inputFileDrive1 <- driveFilePath(1)
inputFileDrive2 <- driveFilePath(2)
inputFileDrive3 <- driveFilePath(3)
# NOTE(review): drive 4 is pinned to a 30 m / 30 m window instead of the
# DISTANCE_PREV / DISTANCE_NEXT settings used for drives 1-3 -- confirm that
# this is intentional.
inputFileDrive4 <- driveFilePath(4, distPrev = 30, distNext = 30)

drive1 <- read.csv(inputFileDrive1)
drive2 <- read.csv(inputFileDrive2)
drive3 <- read.csv(inputFileDrive3)

# Only drive 4 is read with strings converted to factors.
drive4 <- read.csv(inputFileDrive4, stringsAsFactors = TRUE)
```

```{r}
# Assemble the modelling table: one copy of the drive-4 rows per segment (1-4),
# each extended with that segment's feature columns from drives 1-3.

# The drives are glued together by row position, so every table must have the
# same number of rows; otherwise cbind() would either fail or silently recycle.
# NOTE(review): rows are matched by position, not by a key -- confirm that all
# four CSVs list the same subjects in the same order.
stopifnot(
  nrow(drive1) == nrow(drive4),
  nrow(drive2) == nrow(drive4),
  nrow(drive3) == nrow(drive4)
)

# One row per feature pair taken from drives 2 and 3. The output column is
# "<prefix>_<drive>_<suffix>"; the source column is "<source><segment>".
# The row order fixes the column order of the combined table.
segmentFeatureSpec <- data.frame(
  prefix = c("PP_Dev", "PP_Dev", "Std_PP", "Std_PP",
             "Mean_PP", "Mean_PP", "Std_PP", "Std_PP"),
  suffix = c("Straight", "Turning", "Straight", "Turning",
             "AccHigh", "AccLow", "AccHigh", "AccLow"),
  source = c("MeanPP_Seg", "MeanPP_Seg0_", "StdPP_Seg", "StdPP_Seg0_",
             "MeanPP_AccHigh", "X.MeanPP_AccLow",
             "StdPP_AccHigh", "StdPP_AccLow"),
  stringsAsFactors = FALSE
)

# Fetch one column by exact name, failing loudly when it is absent.
pullColumn <- function(df, dfLabel, column) {
  if (!column %in% names(df)) {
    stop("Column '", column, "' not found in ", dfLabel, call. = FALSE)
  }
  df[[column]]
}

# Build the table for one segment: the drive-4 columns, then PP_Dev_1_Turning
# (drive 1, MeanPP_Seg0), then the drive-2/drive-3 feature pairs, then the
# segment number in a `Segment` column.
buildSegmentDf <- function(segment) {
  driveTables <- list("2" = drive2, "3" = drive3)

  features <- list(
    PP_Dev_1_Turning = pullColumn(drive1, "drive1", "MeanPP_Seg0")
  )
  for (i in seq_len(nrow(segmentFeatureSpec))) {
    sourceColumn <- paste0(segmentFeatureSpec$source[i], segment)
    for (drive in names(driveTables)) {
      featureName <- paste(
        segmentFeatureSpec$prefix[i], drive, segmentFeatureSpec$suffix[i],
        sep = "_"
      )
      features[[featureName]] <- pullColumn(
        driveTables[[drive]], paste0("drive", drive), sourceColumn
      )
    }
  }

  cbind(
    drive4,
    as.data.frame(features),
    Segment = rep(segment, nrow(drive4))
  )
}

combinedDf_Seg1 <- buildSegmentDf(1)
combinedDf_Seg2 <- buildSegmentDf(2)
combinedDf_Seg3 <- buildSegmentDf(3)
combinedDf_Seg4 <- buildSegmentDf(4)

# combinedDf_Seg1$Subject <- paste0(as.factor(combinedDf_Seg1$Subject), ".S1")
# combinedDf_Seg2$Subject <- paste0(as.factor(combinedDf_Seg2$Subject), ".S2")
# combinedDf_Seg3$Subject <- paste0(as.factor(combinedDf_Seg3$Subject), ".S3")
# combinedDf_Seg4$Subject <- paste0(as.factor(combinedDf_Seg4$Subject), ".S4")

# Stack the four per-segment tables into one long table.
combinedDf <- rbind(
  combinedDf_Seg1, combinedDf_Seg2, combinedDf_Seg3, combinedDf_Seg4
)

# combinedDf$Subject <- paste0("#", str_pad(combinedDf$Subject, 2, pad="0"))
combinedDf$Segment <- as.factor(combinedDf$Segment)
# Activity coding: "NO" -> "1", "C" -> "2", every other value -> "3".
combinedDf$ActivityEncoded <- factor(
  ifelse(
    combinedDf$Activity == "NO", "1",
    ifelse(combinedDf$Activity == "C", "2", "3")
  )
)

# Drop every row that has an NA in any column, including columns that the
# models below do not use.
combinedDf <- combinedDf[complete.cases(combinedDf), ]
combinedDf$Subject <- as.factor(combinedDf$Subject)
```


```{r}
# Ordinary least-squares fit of PP_After on the Straight/Turning mean and
# standard-deviation features of drives 2 and 3, plus the activity factor.
#
# lm() has no random-effects term and no REML method: a `random = ...`
# argument is discarded with a warning and `method = "REML"` falls back to the
# default QR fit. Those two arguments are therefore left out; the fitted
# coefficients are the same. A per-subject random intercept would need
# nlme::lme() or lme4::lmer() instead.
model <- lm(
  PP_After ~
    PP_Dev_2_Straight +
    PP_Dev_3_Straight +
    PP_Dev_2_Turning +
    PP_Dev_3_Turning +
    Std_PP_2_Straight +
    Std_PP_3_Straight +
    Std_PP_2_Turning +
    Std_PP_3_Turning +
    # PP_Prior +
    factor(ActivityEncoded),
  data = combinedDf
)

# anova(model)
summary(model)
# Standard lm diagnostic plots.
plot(model)
```

# No Random Effects
```{r}
# Fixed-effects-only fit: PP_After regressed on the AccHigh/AccLow mean and
# standard-deviation features of drives 2 and 3, plus the activity factor.
linearModel1 <- lm(
  PP_After ~
    Mean_PP_2_AccHigh +
    Mean_PP_2_AccLow +
    Mean_PP_3_AccHigh +
    Mean_PP_3_AccLow +
    Std_PP_2_AccHigh +
    Std_PP_2_AccLow +
    Std_PP_3_AccHigh +
    Std_PP_3_AccLow +
    # PP_Prior +
    factor(ActivityEncoded),
  data = combinedDf
)

# anova(model)
# Coefficient table followed by the standard lm diagnostic plots.
summary(linearModel1)
plot(linearModel1)
```


# With Random Effects
```{r}
# Mixed model: the AccHigh/AccLow features as fixed effects and a random
# intercept per Subject, fitted by REML. This overwrites the `linearModel1`
# object created by the previous chunk.
#
# NOTE(review): PP_After and Subject both come from the drive-4 table, which is
# stacked once per segment when combinedDf is built. If drive 4 holds one row
# per subject, PP_After is constant within each Subject, so the random
# intercept can reproduce the outcome exactly and leave nothing for the fixed
# effects or the residual. A previous run of this chunk reported a convergence
# failure and a residual variance of about 2e-17 -- confirm before interpreting
# this fit.
linearModel1 <- lmer(
  PP_After ~
    (1 | Subject) +
    Mean_PP_2_AccHigh +
    Mean_PP_2_AccLow +
    Mean_PP_3_AccHigh +
    Mean_PP_3_AccLow +
    Std_PP_2_AccHigh +
    Std_PP_2_AccLow +
    Std_PP_3_AccHigh +
    Std_PP_3_AccLow,
  # + factor(ActivityEncoded),
  data = combinedDf,
  REML = TRUE
)

# anova(model)
summary(linearModel1)
plot(linearModel1)
```


```{r}
# Reduced fixed-effects fit: only the four AccHigh/AccLow mean features of
# drives 2 and 3 plus the activity factor (no standard-deviation terms).
# This overwrites the previous `linearModel1`.
linearModel1 <- lm(
  PP_After ~
    Mean_PP_2_AccHigh +
    Mean_PP_2_AccLow +
    Mean_PP_3_AccHigh +
    Mean_PP_3_AccLow +
    # PP_Prior +
    factor(ActivityEncoded),
  data = combinedDf
)

# anova(model)
# Coefficient table followed by the standard lm diagnostic plots.
summary(linearModel1)
plot(linearModel1)
```

## Machine Learning

```{r}
# Turn the continuous outcome into a binary class: otsu() picks a cut-off on
# PP_After, and rows at or above it become the positive class.
ppAfter <- combinedDf$PP_After
# otsu() is handed the values as a 1 x n matrix.
ppAfterArray <- matrix(ppAfter, nrow = 1, ncol = length(ppAfter))

thresholdPPAfter <- otsu(ppAfterArray, range = range(ppAfter)) # Expected Threshold > 0.10123
print(paste0('Threshold: ', thresholdPPAfter))

# Feature table for the classifier. The column order is kept as-is because it
# becomes the feature order of the matrix passed to xgboost.
selectedDf <- combinedDf %>% select(
  "Activity", "PP_After", # "PP_Prior",
  "Mean_PP_2_AccHigh", "Mean_PP_3_AccHigh",
  "Mean_PP_2_AccLow", "Mean_PP_3_AccLow",
  "Std_PP_2_AccHigh", "Std_PP_3_AccHigh",
  "Std_PP_2_AccLow", "Std_PP_3_AccLow"
)

# One-hot encode Activity as three numeric 0/1 columns, then drop the original.
selectedDf$Activity_NO <- as.numeric(selectedDf$Activity == "NO")
selectedDf$Activity_C <- as.numeric(selectedDf$Activity == "C")
selectedDf$Activity_M <- as.numeric(selectedDf$Activity == "M")
selectedDf$Activity <- NULL

# selectedDf$PP_Dev_1_Turning <- NULL
# selectedDf$Std_PP_2_Straight <- NULL
# selectedDf$Std_PP_2_Turning <- NULL
# selectedDf$Std_PP_3_Straight <- NULL
# selectedDf$Std_PP_3_Turning <- NULL
#
# # According to Linear model
# selectedDf$PP_Dev_2_Straight <- abs(selectedDf$PP_Dev_2_Straight)
# selectedDf$PP_Dev_3_Straight <- abs(selectedDf$PP_Dev_3_Straight)
# selectedDf$PP_Dev_2_Turning <- abs(selectedDf$PP_Dev_2_Turning)
# selectedDf$PP_Dev_3_Turning <- abs(selectedDf$PP_Dev_3_Turning)
# selectedDf$PP_Prior <- abs(selectedDf$PP_Prior) # NULL

# Logical class label; the continuous outcome is removed so it cannot be used
# as a feature.
selectedDf$Class <- selectedDf$PP_After >= thresholdPPAfter
selectedDf$PP_After <- NULL

print(names(selectedDf))
```

```{r}
# library(mefa)
# combinedDf <- rep(combinedDf, 10) 
```

```{r}
# Re-seed before the cross-validation run.
set.seed(39)
n_folds <- 10

# Booster settings, reused by the final fit further down.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "auc",
  eta = 0.1,
  max_depth = 10,
  alpha = 1,
  lambda = 0,
  gamma = 0.45,
  min_child_weight = 0.3,
  subsample = 1,
  colsample_bytree = 1
)
# Second name for the same list, kept because both names were assigned before.
params <- param

# Every column except the class label is a feature.
cvFeatures <- as.matrix(selectedDf %>% select(-Class))

# XGBoost model: stratified n_folds-fold cross-validation.
xgb_m <- xgb.cv(
  params = param,
  data = cvFeatures,
  label = selectedDf$Class,
  nrounds = 100,
  verbose = FALSE,
  # Keep the per-row predictions in xgb_m$pred.
  prediction = TRUE,
  # FALSE: the metric monitored for early stopping counts as lower-is-better.
  # NOTE(review): both "auc" and "error" are requested -- confirm which one
  # early stopping monitors, since the direction only suits "error".
  maximize = FALSE,
  nfold = n_folds,
  metrics = c("auc", "error"),
  early_stopping_rounds = 50,
  stratified = TRUE,
  scale_pos_weight = 1
)

# xgb_m$evaluation_log[xgb_m$best_iteration,"test_auc_mean"]
# Show the evaluation-log row of the best iteration.
xgb_m$evaluation_log[xgb_m$best_iteration,]

```

## Performance Metrics
```{r}
# Prediction
# Round the predictions stored by xgb.cv() to hard 0/1 class labels and keep
# them next to the true class.
# NOTE(review): presumably these are out-of-fold probabilities, since the
# cross-validation was run with prediction enabled -- confirm.
selectedDf$clsPred <- round(xgb_m$pred)

# Compute binary-classification metrics from a two-column data frame.
#
# sdat: data frame whose FIRST column is the true class and whose SECOND column
#       is the predicted class. Logical values are treated as 0/1, so a
#       TRUE/FALSE truth column lines up with a 1/0 prediction column. Rows
#       with an NA in either column are ignored.
#
# Returns an unnamed numeric vector in this order:
#   accuracy, specificity, sensitivity (recall), precision, NPV.
# The larger of the two labels (after sorting) is the positive class. A metric
# whose denominator is zero (e.g. precision when nothing is predicted positive)
# is NaN. Stops with an error unless exactly two labels are present overall.
computePerformanceResults <- function(sdat) {
  sdat <- sdat[complete.cases(sdat), ]

  # Bring both columns onto one label coding before tabulating.
  asLabel <- function(x) {
    if (is.logical(x)) {
      x <- as.integer(x)
    }
    as.character(x)
  }
  actual <- asLabel(sdat[[1]])
  predicted <- asLabel(sdat[[2]])

  labels <- sort(unique(c(actual, predicted)))
  if (length(labels) != 2) {
    stop(
      "computePerformanceResults() needs exactly two class labels, found ",
      length(labels),
      call. = FALSE
    )
  }

  acc <- mean(actual == predicted)

  # Shared factor levels force a full 2 x 2 table even when one class is never
  # predicted. Rows are the true class, columns the predicted class.
  conf_mat <- table(
    actual = factor(actual, levels = labels),
    predicted = factor(predicted, levels = labels)
  )
  tn <- conf_mat[1, 1]
  fp <- conf_mat[1, 2]
  fn <- conf_mat[2, 1]
  tp <- conf_mat[2, 2]

  specif <- tn / (tn + fp)
  sensiv <- tp / (tp + fn)
  preci <- tp / (tp + fp)
  npv <- tn / (tn + fn)

  c(acc, specif, sensiv, preci, npv)
}

# Score the rounded predictions against the true class. The positions read
# below follow the order in which computePerformanceResults() returns its
# five values.
performance <- computePerformanceResults(selectedDf %>% select(Class, clsPred))
acc <- performance[1]
spec <- performance[2]
recall <- performance[3]
prec <- performance[4]
npv <- performance[5]
# F1 combines the precision and recall values read above.
f1 <- (2 * recall * prec) / (recall + prec)
# Mean test AUC at the best cross-validation iteration.
auc <- as.numeric(xgb_m$evaluation_log[xgb_m$best_iteration, "test_auc_mean"])

# Print every metric rounded to two decimals, one line each.
reportedMetrics <- c(
  Accuracy = acc,
  Precision = prec,
  Recall = recall,
  Specificity = spec,
  NPV = npv,
  F1 = f1,
  AUC = auc
)
for (metricName in names(reportedMetrics)) {
  print(paste(paste0(metricName, "="), round(reportedMetrics[[metricName]], 2)))
}
```

```{r}
# Importance
# Fit one model on all rows and read the feature importance from it.
# Every column except the class label and the stored prediction is a feature.
importanceFeatures <- selectedDf %>% select(-c(Class, clsPred))

# Only arguments that xgboost() itself understands are passed. `prediction`,
# `nfold`, `metrics` and `stratified` belong to xgb.cv(); here they would be
# forwarded as unknown booster parameters.
bst <- xgboost(
  params = param,
  data = as.matrix(importanceFeatures),
  label = selectedDf$Class,
  nrounds = 100,
  verbose = FALSE,
  early_stopping_rounds = 50,
  # The only metric evaluated here is the AUC from `param`, and for AUC higher
  # is better. With maximize = FALSE early stopping would look for the lowest
  # AUC and end training early.
  # NOTE(review): this fit has no held-out data, so early stopping watches the
  # training set -- consider a fixed number of rounds instead.
  maximize = TRUE,
  scale_pos_weight = 1
)
importanceDf <- xgb.importance(colnames(importanceFeatures), model = bst)
print(importanceDf)
```

```{r}
library(pROC)

# ROC curve of the cross-validation predictions against the true class.
# The unrounded scores in xgb_m$pred are used as the predictor: rounding them
# to 0/1 first leaves a single operating point, so the "curve" would be two
# straight segments and no other threshold would be represented.
# levels = c(0, 1) with direction = "<" declares 0 the control class and 1 the
# case class, with higher scores pointing to the case class.
dfROC <- pROC::roc(
  response = as.integer(selectedDf$Class),
  predictor = xgb_m$pred,
  levels = c(0, 1),
  direction = "<"
)

# it = which.max(xgb_m$evaluation_log$test_auc_mean)
# best.iter = xgb_m$evaluation_log$iter[it]
# best.iter

# Plot the object computed above instead of fitting the same ROC twice.
plot(
  dfROC,
  legacy.axes = TRUE,
  main = "ROC Curve",
  lwd = 1.5
)
```


### Plot feature importance
```{r}
# Grouped bar chart of the three importance measures (Gain, Cover, Frequency)
# per feature.
yAxis <- list(
  title = 'Importance',
  range = c(0.0, 1.0)
)
xAxis <- list(
  title = ''
)

# Order the bars by decreasing Gain.
importanceDf$Feature <- factor(
  importanceDf$Feature,
  levels = importanceDf[order(-Gain), ]$Feature
)

# The pipe already supplies the plot as config()'s first argument. Passing
# `.Last.value` as well would hand config() a stray positional argument, so it
# is called with the mathjax option only.
fig_Importance <- plot_ly(
  importanceDf,
  x = ~Feature, y = ~Gain,
  type = 'bar', name = 'Gain', width = 600
) %>%
  add_trace(y = ~Cover, name = 'Cover') %>%
  add_trace(y = ~Frequency, name = 'Frequency') %>%
  layout(
    yaxis = yAxis, xaxis = xAxis,
    barmode = 'group', title = "Feature Importance"
  ) %>%
  config(mathjax = 'cdn')

htmltools::tagList(fig_Importance)
```


